Loading the required libraries

library(tidyverse)      # for data manipulation
library(DESeq2)         # for Differential gene expression analysis
library(ggplot2)        # for plotting
library(plotly)         # for interactive plots
library(GenomicRanges)  # for creating and manipulating genomic ranges
library(mixOmics)       # for PCA

Importing data

assay<- read.csv("assay.csv", row.names = 1)  # Importing gene expression assay data
colData <- read.csv("colData.csv")            # Importing clinical information
temp<- read.csv("rawRange.csv")               # Importing rawRanges as dataframe

rowRanges <- GRanges(                                             # creating rawRanges from dataframge
             seqnames = Rle(temp$chr),                            # Chromosome names
             ranges = IRanges(start = temp$start, end =temp$end), # Start and End positions
             strand = Rle(temp$strand),                           # Strand information (if available)
             data.frame(Gene = temp$Gene, Length = temp$length))  # Additional columns

Printing all data

print("The count data as Assay: ")
## [1] "The count data as Assay: "
head(assay)
##            HK16 HK19 HK24 HK25 HK26 WT16 WT19 WT24 WT25 WT26
## DDX11L1       0    0    0    1    0    0    0    0    0    0
## WASH7P       19   33   50   38   39   25   66   70   43   39
## MIR1302-11    0    0    1    0    0    0    0    1    0    1
## FAM138A       0    0    0    0    0    0    0    0    0    0
## OR4G4P        0    0    0    0    0    0    0    0    0    0
## OR4G11P       0    0    0    0    0    0    0    0    0    0
print("The clinical data data as colData: ")
## [1] "The clinical data data as colData: "
head(colData)
##   Sample_Name Sample_Group    Sex Stage Batch
## 1        HK16           HK   Male    HK     A
## 2        HK19           HK Female    HK     A
## 3        HK24           HK   Male    HK     A
## 4        HK25           HK Female    HK     A
## 5        HK26           HK   Male    HK     A
## 6        WT16           WT   Male  I-II     A
print("The gene coordinates as rawRanges ")
## [1] "The gene coordinates as rawRanges "
head(rowRanges)
## GRanges object with 6 ranges and 2 metadata columns:
##       seqnames      ranges strand |        Gene    Length
##          <Rle>   <IRanges>  <Rle> | <character> <integer>
##   [1]     chr1 11869-12227      + |     DDX11L1      1756
##   [2]     chr1 14363-14829      - |      WASH7P      2073
##   [3]     chr1 29554-30039      + |  MIR1302-11      1021
##   [4]     chr1 34554-35174      - |     FAM138A      1219
##   [5]     chr1 52473-53312      + |      OR4G4P       947
##   [6]     chr1 62948-63887      + |     OR4G11P       940
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

Visualizing variability in library size

nreads <- data.frame(reads = colSums(assay)) %>%                   # Calculating library size by summing read counts for each sample
  rownames_to_column("sample") %>%
  mutate(Sample_Group = case_when(grepl("HK", sample) ~ "HK",
                         grepl("WT", sample) ~ "WT")) 

lsize <- ggplot(nreads, aes(x=Sample_Group, y=reads, fill=Sample_Group,label=sample)) +
  geom_boxplot() +
  geom_jitter(width = 0.25) +
  scale_fill_manual(values = c("forest green", "steel blue")) +
  theme_minimal() + 
  ggtitle("Library size")


ggplotly(lsize)

Filtering out low count genes

Filtering low-count genes is a common preprocessing step in RNA-Seq data analysis, especially when using tools like DESeq2 for differential expression analysis. Here’s why it’s important:

Low-Count Genes Provide Little Statistical Power

Genes with low counts typically have high variability due to the limited sampling of sequencing reads. This variability makes it difficult to reliably detect significant differences in expression levels between conditions.

Reduction of Multiple Testing Burden

Differential expression analysis involves statistical tests for thousands of genes. Each test adds to the multiple-testing burden, requiring more stringent p-value adjustments.

Minimizing Noise

Low-count genes are often affected by: Sequencing noise: Random fluctuations during library preparation or sequencing. Mapping artifacts: Reads mapped spuriously to regions with minimal biological relevance.

We are usnig the following filters

rowSums(Exp.data > 0) >= 2: At least 2 samples with non-zero expression.
rowSums(Exp.data) >= 20: Total expression across samples is at least 20
assay.filtered <- assay[rowSums(assay > 0) >= 2 & rowSums(assay) >= 20,]

paste0("Number of genes before filtering: ", nrow(assay))
## [1] "Number of genes before filtering: 379"
paste0("Number of genes after filtering: ", nrow(assay.filtered))
## [1] "Number of genes after filtering: 201"

Visualizing RLE

A Relative Log Expression (RLE) plot is a diagnostic tool commonly used in RNA-Seq data analysis to assess the normalization of a count matrix. It shows the log ratios of gene expression counts relative to their median across samples.

# Add pseudocount and calculate log2-transformed counts
log_counts <- log2(assay.filtered + 1)

# Calculate relative log expression
medians <- apply(log_counts, 1, median)         # Median for each gene across samples
rle <- sweep(log_counts, 1, medians, FUN = "-") # Subtract medians (log scale)

# Reshape data for ggplot
rle_long <- reshape2::melt(rle)
colnames(rle_long) <- c("Sample", "Deviation")
rle_long$Sample_group <-  rep(c("HK", "WT"), c(1005,1005))

# Plot RLE as a boxplot colored by Sample_Group
ggplot(rle_long, aes(x = Sample, y = Deviation, fill = Sample_group)) +
  geom_boxplot(outlier.size = 0.5, outlier.colour = "red") +
  scale_fill_manual(values = c("forest green", "steel blue")) +
  theme_minimal() +
  labs(
    title = "RLE Plot before Normalization",
    x = "Sample",
    y = "Relative Log Expression (Deviation)",
    fill = "Sample_group"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for clarity

Creating DEseq2 object with filtered counts

In DESeq2, the primary data structure for storing RNA-Seq data and performing analyses is the DESeqDataSet (DDS) object. It is specifically designed to handle the raw counts and associated metadata for differential expression analysis.

What is a DESeqDataSet?

A DESeqDataSet is an object from the S4 class system in R that holds:

Count Data: The raw, unnormalized read counts for genes across samples.

Column Metadata: Information about the experimental design (e.g., condition, batch).

Row Metadata: Gene annotations or other feature-level metadata.

Design Formula: Specifies the experimental design to model differential expression (e.g., ~ condition or ~ condition + batch).

Note: DESeq2 take raw counts as input

DE.obj <- DESeqDataSetFromMatrix(countData = assay.filtered,
                                 colData   = colData,
                                 design    = ~ Sample_Group)

Estimating size factor and normalizing counts

DESeq2 performs normalization using ‘size factor’ to account for differences in sequencing depth and other systematic biases across samples in RNA-Seq datasets.

The size factor is a scaling factor used in RNA-Seq analysis to normalize read counts across samples, accounting for differences in sequencing depth or library size. This ensures that comparisons of gene expression levels are not biased by variations in the total number of reads per sample.

DE.obj <- estimateSizeFactors(DE.obj)            # Estimating size factor
 
DE.normalized <- counts(DE.obj, normalized = T)  # Normalizing counts

# Add pseudocount and calculate log2-transformed counts
log_counts <- log2(DE.normalized + 1) %>% as.data.frame()

# Calculate relative log expression
medians <- apply(log_counts, 1, median)         # Median for each gene across samples
rle <- sweep(log_counts, 1, medians, FUN = "-") # Subtract medians (log scale)

# Reshape data for ggplot
rle_long <- reshape2::melt(rle)
colnames(rle_long) <- c("Sample", "Deviation")
rle_long$Sample_group <-  rep(c("HK", "WT"), c(1005,1005))

# Plot RLE as a boxplot colored by Sample_Group
ggplot(rle_long, aes(x = Sample, y = Deviation, fill = Sample_group)) +
  geom_boxplot(outlier.size = 0.5, outlier.colour = "red") +
  scale_fill_manual(values = c("forest green", "steel blue")) +
  theme_minimal() +
  labs(
    title = "RLE Plot after Normalization",
    x = "Sample",
    y = "Relative Log Expression (Deviation)",
    fill = "Sample_group"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for clarity

Visualizing data

Principal Component Analysis (PCA) is a dimensionality reduction technique widely used in data analysis, including RNA-Seq or other high-dimensional datasets. It transforms the data into a new coordinate system such that the greatest variance lies along the first principal component (PC1), the second greatest variance along the second component (PC2), and so on.

pca1<- pca(t(DE.normalized), center = T, scale = T)       # Performing PCA

plotIndiv(pca1, group = colData$Sample_Group, ind.names = F, 
          title = "Total expression profile",
          legend = T, legend.position = "bottom", col.per.group = c("forest green", "steel blue"))

Performing Differential expression analysis

DESeq2 performs differential expression gene (DEG) analysis using a model-based approach designed specifically for count data from RNA-Seq experiments. It employs generalized linear models (GLMs) and statistical techniques to identify genes with significant differences in expression across conditions.

Normalization

Dispersion Estimation

Model fitting

Hypothesis Testing

Multiple Testing Correction

DE.obj$Sample_Group <- relevel(DE.obj$Sample_Group, ref = "HK")  # Setting reference group

DE.obj <- DESeq(DE.obj)                                          #Fitting the model

res.con <- results(DE.obj,                                       # Extracting result table
                   contrast=c("Sample_Group", "WT", "HK"),       # Setting Contrast 
                   pAdjustMethod = "BH",                         # Using 'BH' for p-value adjustment
                   alpha = 0.1)                                  # Adjusted p-value threshold
                   
res.con <- res.con[!is.na(res.con$padj),]                        # Removing genes with NA adj p-values                  

Visualizing results

volcano <- as.data.frame(res.con) %>%
  rownames_to_column("gene") %>%
  mutate(Logpadj=-log10(padj)) %>%
  mutate(sig=case_when(log2FoldChange >= 1 & padj < 0.05 ~ "Up",
                       log2FoldChange <= -1 & padj < 0.05 ~ "Down")) %>%
  ggplot(aes(x=log2FoldChange, y=Logpadj, color=sig, label=gene)) + 
  geom_point(size=1) +
  theme_bw() + 
  ylab("-Log10(FDR)") + 
  theme(legend.position = "none") +
  scale_color_manual(values = c("firebrick", "steelblue")) +
  geom_vline(xintercept = c(-1, 1), col = "gray", linetype = 'dashed') + 
  geom_hline(yintercept = -log10(0.05), col = "gray", linetype = 'dashed') +
  ggtitle("Volcano DEGs")
  

ggplotly(volcano)